{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "13889d03-aaa9-4df3-8bf5-4a8e60818197",
   "metadata": {
    "dotnet_interactive": {
     "language": "csharp"
    },
    "polyglot_notebook": {
     "kernelName": "csharp"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, pipeline,TextStreamer\n",
    "import intel_npu_acceleration_library as npu_lib\n",
    "\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3a515884-f15a-4d2a-bc6a-988283e1cb6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_id = \"microsoft/Phi-3-mini-4k-instruct\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "aaca11c0-3ef6-4f89-8461-61deb1bb232f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Compiling model microsoft/Phi-3-mini-4k-instruct NPUDtype(name='int4', bits=4, min=-8, max=7, torch_dtype=torch.int8) for the NPU\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\ProgramData\\miniforge3\\envs\\pydev\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n",
      "`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.\n",
      "Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "85655e4130b14d03bf2a89170f378f20",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Exporting model microsoft/Phi-3-mini-4k-instruct to cache\\models\\microsoft_Phi-3-mini-4k-instruct_5159f8a4e1e2f2ef78fb6e448b4e0921f41275185da4e2b8dc6b4838ee810541_v1.2.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "model = npu_lib.NPUModelForCausalLM.from_pretrained(\n",
    "                                    model_id,\n",
    "                                    torch_dtype=\"auto\",\n",
    "                                    dtype=npu_lib.int4,\n",
    "                                    trust_remote_code=True\n",
    "                                )\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "\n",
    "text_streamer = TextStreamer(tokenizer, skip_prompt=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0f5d1a2f-691b-49f3-a054-7c998d679663",
   "metadata": {},
   "outputs": [],
   "source": [
    "generation_args = {\n",
    "            \"max_new_tokens\": 1024,\n",
    "            \"return_full_text\": False,\n",
    "            \"temperature\": 0.3,\n",
    "            \"do_sample\": False,\n",
    "            \"streamer\": text_streamer,\n",
    "        }\n",
    "\n",
    "pipe = pipeline(\n",
    "            \"text-generation\",\n",
    "            model=model,\n",
    "            tokenizer=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "262b65bd-573e-4bf8-a5ec-c401a2ba029a",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f39f0e41-6875-40e8-aec0-45d47af5b42b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C<|system|> You are a helpful AI assistant.<|end|><|user|> Can you introduce yourself?<|end|><|assistant|> Certainly! I'm an AI language model developed by Microsoft, known as \"GPT-1 to GPT-4\". My purpose is to assist and provide information by processing and generating text based on theiven inputs. I'm capable of performing a wide range of tasks, such as answering questions, providing explanations, offering suggestions, and even engaging in casual conversations. My abilities are continuously improving, and I'm designed to learn from interactions to provide the best possible assistance. However, I don't have a physical presence or personal experiences. My responses are generated based on patterns in data.<|end|>\n"
     ]
    }
   ],
   "source": [
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    pipe(query, **generation_args)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pydev",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "polyglot_notebook": {
   "kernelInfo": {
    "defaultKernelName": "csharp",
    "items": [
     {
      "aliases": [],
      "name": "csharp"
     }
    ]
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
